Spatial Analysis and Statistical Modeling with R and spmodel - 2

2024 Society for Freshwater Science Conference

Ryan Hill

EPA (USA)

Michael Dumelle

EPA (USA)

2024-06-02

GIS in R

Maintaining all analyses within a single software (R) can greatly simplify your research workflow. In this section, we’ll cover the basics of doing GIS in R.

Goals and Motivation

  • Understand the main features and types of vector data.
  • Generate point data from a set of latitudes and longitudes, such as from fields sites.
  • Read, write, query, and manipulate vector data using the sf package.

Points, lines, and polygons

Figure 1: Vector data. Image from: https://earthdatascience.org/courses/earth-analytics/spatial-data-r/intro-vector-data-r/

Points, lines, and polygons

We can represent these features in R without actually using GIS packages.

library(tidyverse)
library(ggplot2)

id <- c(1:5)
cities <- c('Ashland','Corvallis','Bend','Portland','Newport')
longitude <- c(-122.699, -123.275, -121.313, -122.670, -124.054)
latitude <- c(42.189, 44.57, 44.061, 45.523, 44.652)
population <- c(20062, 50297, 61362, 537557, 9603)

oregon_cities <- data.frame(id, cities, longitude, latitude, population)

Points, lines, and polygons

ggplot(
  data = oregon_cities, 
  aes(x = longitude, y = latitude, size = population, label = cities)
) +
  geom_point() +
  geom_text(hjust = 1, vjust = 1) +
  theme_bw()

Points, lines, and polygons

Figure 2: Oregon cities plotted from data frame.

Points, lines, and polygons

So, is this sufficient for working with spatial data in R and doing spatial analysis? What are we missing?

If you have worked with vector data before, you may know that these data also usually have:

  • A coordinate reference system
  • A bounding box or extent
  • Plot order
  • Additional data

Exploring the Simple Features (sf) package

  • The sf package provides simple features access for R
  • sf fits in within the “tidy” approach to data of Hadley Wickham’s tidyverse
  • In short, much of what used to require ArcGIS license can now be done in R with sf

Exploring the Simple Features (sf) package

library(sf)
ls("package:sf")
  [1] "%>%"                          "as_Spatial"                  
  [3] "dbDataType"                   "dbWriteTable"                
  [5] "gdal_addo"                    "gdal_create"                 
  [7] "gdal_crs"                     "gdal_extract"                
  [9] "gdal_inv_geotransform"        "gdal_metadata"               
 [11] "gdal_polygonize"              "gdal_rasterize"              
 [13] "gdal_read"                    "gdal_read_mdim"              
 [15] "gdal_subdatasets"             "gdal_utils"                  
 [17] "gdal_write"                   "gdal_write_mdim"             
 [19] "get_key_pos"                  "NA_agr_"                     
 [21] "NA_bbox_"                     "NA_crs_"                     
 [23] "NA_m_range_"                  "NA_z_range_"                 
 [25] "plot_sf"                      "rawToHex"                    
 [27] "read_sf"                      "sf.colors"                   
 [29] "sf_add_proj_units"            "sf_extSoftVersion"           
 [31] "sf_proj_info"                 "sf_proj_network"             
 [33] "sf_proj_pipelines"            "sf_proj_search_paths"        
 [35] "sf_project"                   "sf_use_s2"                   
 [37] "st_agr"                       "st_agr<-"                    
 [39] "st_area"                      "st_as_binary"                
 [41] "st_as_grob"                   "st_as_s2"                    
 [43] "st_as_sf"                     "st_as_sfc"                   
 [45] "st_as_text"                   "st_axis_order"               
 [47] "st_bbox"                      "st_bind_cols"                
 [49] "st_boundary"                  "st_break_antimeridian"       
 [51] "st_buffer"                    "st_can_transform"            
 [53] "st_cast"                      "st_centroid"                 
 [55] "st_collection_extract"        "st_combine"                  
 [57] "st_concave_hull"              "st_contains"                 
 [59] "st_contains_properly"         "st_convex_hull"              
 [61] "st_coordinates"               "st_covered_by"               
 [63] "st_covers"                    "st_crop"                     
 [65] "st_crosses"                   "st_crs"                      
 [67] "st_crs<-"                     "st_delete"                   
 [69] "st_difference"                "st_dimension"                
 [71] "st_disjoint"                  "st_distance"                 
 [73] "st_drivers"                   "st_drop_geometry"            
 [75] "st_equals"                    "st_equals_exact"             
 [77] "st_filter"                    "st_geometry"                 
 [79] "st_geometry_type"             "st_geometry<-"               
 [81] "st_geometrycollection"        "st_graticule"                
 [83] "st_inscribed_circle"          "st_interpolate_aw"           
 [85] "st_intersection"              "st_intersects"               
 [87] "st_is"                        "st_is_empty"                 
 [89] "st_is_longlat"                "st_is_simple"                
 [91] "st_is_valid"                  "st_is_within_distance"       
 [93] "st_jitter"                    "st_join"                     
 [95] "st_layers"                    "st_length"                   
 [97] "st_line_interpolate"          "st_line_merge"               
 [99] "st_line_project"              "st_line_sample"              
[101] "st_linestring"                "st_m_range"                  
[103] "st_make_grid"                 "st_make_valid"               
[105] "st_minimum_rotated_rectangle" "st_multilinestring"          
[107] "st_multipoint"                "st_multipolygon"             
[109] "st_nearest_feature"           "st_nearest_points"           
[111] "st_node"                      "st_normalize"                
[113] "st_overlaps"                  "st_perimeter"                
[115] "st_point"                     "st_point_on_surface"         
[117] "st_polygon"                   "st_polygonize"               
[119] "st_precision"                 "st_precision<-"              
[121] "st_read"                      "st_read_db"                  
[123] "st_relate"                    "st_reverse"                  
[125] "st_sample"                    "st_segmentize"               
[127] "st_set_agr"                   "st_set_crs"                  
[129] "st_set_geometry"              "st_set_precision"            
[131] "st_sf"                        "st_sfc"                      
[133] "st_shift_longitude"           "st_simplify"                 
[135] "st_snap"                      "st_sym_difference"           
[137] "st_touches"                   "st_transform"                
[139] "st_triangulate"               "st_triangulate_constrained"  
[141] "st_union"                     "st_viewport"                 
[143] "st_voronoi"                   "st_within"                   
[145] "st_wrap_dateline"             "st_write"                    
[147] "st_write_db"                  "st_z_range"                  
[149] "st_zm"                        "vec_cast.sfc"                
[151] "vec_ptype2.sfc"               "write_sf"                    

Exploring the Simple Features (sf) package

  • Convert the existing oregon cities data frame to a simple feature by:
  • Supplying the longitude and latitude (x and y).
  • Defining the (geographic) coordinate reference system (crs).

Exploring the Simple Features (sf) package

oregon_cities <- oregon_cities %>%
  st_as_sf(coords = c('longitude', 'latitude'), crs = 4269)
print(oregon_cities)
Simple feature collection with 5 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -124.054 ymin: 42.189 xmax: -121.313 ymax: 45.523
Geodetic CRS:  NAD83
  id    cities population                geometry
1  1   Ashland      20062 POINT (-122.699 42.189)
2  2 Corvallis      50297  POINT (-123.275 44.57)
3  3      Bend      61362 POINT (-121.313 44.061)
4  4  Portland     537557  POINT (-122.67 45.523)
5  5   Newport       9603 POINT (-124.054 44.652)

Exploring the Simple Features (sf) package

The oregon_cities object has now changed from being a standard data frame and includes features that are required for a true spatial object.

  • Geometry type
  • Bounding box
  • Coordinate reference system

Latitude and longitude columns have been moved to a new column called "geometry" (sticky).

Coordinate Reference Systems

sf also has functionality to re-project and manipulate spatial objects.

Figure 3: Image from: https://nceas.github.io/oss-lessons/spatial-data-gis-law/1-mon-spatial-data-intro.html

Coordinate Reference Systems

oregon_cities is currently in degrees, but certain applications may require an equal area projection and length units, such as meters. See: epsg.org

With sf we can:

  • Check to see if the current CRS is equal to the Albers Equal-Area Conic Projection
  • Transform oregon_cities to CRS 5070

Coordinate Reference Systems

st_crs(oregon_cities) == st_crs(5070)
[1] FALSE
oregon_cities <- 
  oregon_cities %>% 
  st_transform(crs = 5070)

Coordinate Reference Systems

Now let’s plot the the transformed data…

ggplot(data = oregon_cities) +
  geom_sf_text(aes(label = cities),
               hjust=0, vjust=1.5) +
  geom_sf(aes(size = population)) + 
  xlab('Longitude') +
  ylab('Latitude') +
  theme_bw()

Coordinate Reference Systems

Figure 4: Oregon cities plotted in Albers Equal Area Projection.

It all feels like R

  • There can be huge advantages to doing GIS tasks in R without going back and forth to other GIS software
  • If you are familiar with R, the leap to doing GIS here can be small
  • sf provides a large number of GIS functions, such as buffers, intersection, centroids, etc.

It all feels like R

Example 1: Add 100 Km buffer to cities

cities_buffer <- 
  oregon_cities %>% 
  st_buffer(100000)

Plot map of buffered cities…

ggplot(data = cities_buffer) +
  geom_sf(aes(fill = cities), alpha = 0.5) +
  geom_sf(data = st_centroid(oregon_cities)) +
  theme_bw()

It all feels like R

Buffered cities w/ overlapping buffers.

It all feels like R

Example 2: Split buffers into sub-units and calculate areas

cities_buffer <- cities_buffer %>% 
  st_buffer(100000) %>%
  st_intersection() %>%
  mutate(area = st_area(.) %>% 
           units::drop_units(),
         id = as.factor(1:nrow(.)))

Plot results:

ggplot(data = cities_buffer) +
  geom_sf(aes(fill = id), alpha = 0.5) +
  theme_bw()

It all feels like R

Buffered cities w/ overlapping buffers split.

Raster data

  • Another fundamental data type in GIS is the raster
  • Rasters are a way of displaying gridded data, where each member of the grid represents a landscape feature (e.g., elevation)
  • The terra package is now the best package for working with rasters

Raster data

Much like sf, terra has a large number of functions for working with raster data.

library(terra)
ls("package:terra")
  [1] "%in%"                  "activeCat"             "activeCat<-"          
  [4] "add_box"               "add_legend"            "add<-"                
  [7] "addCats"               "adjacent"              "aggregate"            
 [10] "align"                 "all.equal"             "allNA"                
 [13] "animate"               "app"                   "approximate"          
 [16] "area"                  "Arith"                 "as.array"             
 [19] "as.bool"               "as.contour"            "as.data.frame"        
 [22] "as.factor"             "as.int"                "as.lines"             
 [25] "as.list"               "as.matrix"             "as.points"            
 [28] "as.polygons"           "as.raster"             "atan_2"               
 [31] "atan2"                 "autocor"               "barplot"              
 [34] "blocks"                "boundaries"            "boxplot"              
 [37] "buffer"                "cartogram"             "catalyze"             
 [40] "categories"            "cats"                  "cbind2"               
 [43] "cellFromRowCol"        "cellFromRowColCombine" "cellFromXY"           
 [46] "cells"                 "cellSize"              "centroids"            
 [49] "clamp"                 "clamp_ts"              "classify"             
 [52] "clearance"             "click"                 "colFromCell"          
 [55] "colFromX"              "colorize"              "coltab"               
 [58] "coltab<-"              "combineGeoms"          "compare"              
 [61] "Compare"               "compareGeom"           "concats"              
 [64] "contour"               "convHull"              "costDist"             
 [67] "countNA"               "cover"                 "crds"                 
 [70] "crop"                  "crosstab"              "crs"                  
 [73] "crs<-"                 "datatype"              "deepcopy"             
 [76] "delaunay"              "densify"               "density"              
 [79] "depth"                 "depth<-"               "describe"             
 [82] "diff"                  "direction"             "disagg"               
 [85] "distance"              "dots"                  "draw"                 
 [88] "droplevels"            "elongate"              "emptyGeoms"           
 [91] "erase"                 "expanse"               "ext"                  
 [94] "ext<-"                 "extend"                "extract"              
 [97] "extractAlong"          "extractRange"          "fileBlocksize"        
[100] "fillHoles"             "fillTime"              "flip"                 
[103] "focal"                 "focal3D"               "focalCor"             
[106] "focalCpp"              "focalMat"              "focalPairs"           
[109] "focalReg"              "focalValues"           "forceCCW"             
[112] "free_RAM"              "freq"                  "gaps"                 
[115] "gdal"                  "gdalCache"             "geom"                 
[118] "geomtype"              "getGDALconfig"         "getTileExtents"       
[121] "global"                "graticule"             "gridDist"             
[124] "gridDistance"          "halo"                  "has.colors"           
[127] "has.RGB"               "has.time"              "hasMinMax"            
[130] "hasValues"             "head"                  "hist"                 
[133] "identical"             "ifel"                  "image"                
[136] "impose"                "inext"                 "init"                 
[139] "inMemory"              "inset"                 "interpIDW"            
[142] "interpNear"            "interpolate"           "intersect"            
[145] "is.bool"               "is.empty"              "is.factor"            
[148] "is.int"                "is.lines"              "is.lonlat"            
[151] "is.points"             "is.polygons"           "is.related"           
[154] "is.rotated"            "is.valid"              "isFALSE"              
[157] "isTRUE"                "k_means"               "lapp"                 
[160] "layerCor"              "levels"                "linearUnits"          
[163] "lines"                 "logic"                 "Logic"                
[166] "longnames"             "longnames<-"           "makeNodes"            
[169] "makeTiles"             "makeValid"             "makeVRT"              
[172] "map.pal"               "mask"                  "match"                
[175] "math"                  "Math"                  "Math2"                
[178] "mean"                  "median"                "mem_info"             
[181] "merge"                 "mergeLines"            "mergeTime"            
[184] "meta"                  "metags"                "metags<-"             
[187] "minCircle"             "minmax"                "minRect"              
[190] "modal"                 "mosaic"                "na.omit"              
[193] "NAflag"                "NAflag<-"              "names"                
[196] "ncell"                 "ncol"                  "ncol<-"               
[199] "nearby"                "nearest"               "nlyr"                 
[202] "nlyr<-"                "noNA"                  "normalize.longitude"  
[205] "north"                 "not.na"                "nrow"                 
[208] "nrow<-"                "nsrc"                  "origin"               
[211] "origin<-"              "pairs"                 "panel"                
[214] "patches"               "perim"                 "persp"                
[217] "plet"                  "plot"                  "plotRGB"              
[220] "points"                "polys"                 "prcomp"               
[223] "predict"               "princomp"              "project"              
[226] "quantile"              "query"                 "rangeFill"            
[229] "rapp"                  "rast"                  "rasterize"            
[232] "rasterizeGeom"         "rasterizeWin"          "rcl"                  
[235] "readRDS"               "readStart"             "readStop"             
[238] "readValues"            "rectify"               "regress"              
[241] "relate"                "removeDupNodes"        "res"                  
[244] "res<-"                 "resample"              "rescale"              
[247] "rev"                   "RGB"                   "RGB<-"                
[250] "roll"                  "rotate"                "round"                
[253] "rowColCombine"         "rowColFromCell"        "rowFromCell"          
[256] "rowFromY"              "same.crs"              "sapp"                 
[259] "saveRDS"               "sbar"                  "scale"                
[262] "scoff"                 "scoff<-"               "sds"                  
[265] "segregate"             "sel"                   "selectHighest"        
[268] "selectRange"           "serialize"             "set.cats"             
[271] "set.crs"               "set.ext"               "set.names"            
[274] "set.RGB"               "set.values"            "setGDALconfig"        
[277] "setMinMax"             "setValues"             "shade"                
[280] "sharedPaths"           "shift"                 "sieve"                
[283] "simplifyGeom"          "size"                  "snap"                 
[286] "sort"                  "sources"               "spatSample"           
[289] "spin"                  "split"                 "sprc"                 
[292] "stdev"                 "stretch"               "subset"               
[295] "subst"                 "summary"               "Summary"              
[298] "svc"                   "symdif"                "t"                    
[301] "tail"                  "tapp"                  "terrain"              
[304] "terraOptions"          "text"                  "tighten"              
[307] "time"                  "time<-"                "timeInfo"             
[310] "tmpFiles"              "trans"                 "trim"                 
[313] "union"                 "unique"                "units"                
[316] "units<-"               "unserialize"           "unwrap"               
[319] "update"                "values"                "values<-"             
[322] "varnames"              "varnames<-"            "vect"                 
[325] "vector_layers"         "viewshed"              "voronoi"              
[328] "vrt"                   "vrt_tiles"             "weighted.mean"        
[331] "where.max"             "where.min"             "which.lyr"            
[334] "which.max"             "which.min"             "width"                
[337] "window"                "window<-"              "wrap"                 
[340] "wrapCache"             "writeCDF"              "writeRaster"          
[343] "writeStart"            "writeStop"             "writeValues"          
[346] "writeVector"           "xapp"                  "xFromCell"            
[349] "xFromCol"              "xmax"                  "xmax<-"               
[352] "xmin"                  "xmin<-"                "xres"                 
[355] "xyFromCell"            "yFromCell"             "yFromRow"             
[358] "ymax"                  "ymax<-"                "ymin"                 
[361] "ymin<-"                "yres"                  "zonal"                
[364] "zoom"                 

Raster data

Create and define raster

r <- rast(ncol=10, nrow = 10)

r[] <- runif(n=ncell(r))

r
class       : SpatRaster 
dimensions  : 10, 10, 1  (nrow, ncol, nlyr)
resolution  : 36, 18  (x, y)
extent      : -180, 180, -90, 90  (xmin, xmax, ymin, ymax)
coord. ref. : lon/lat WGS 84 (CRS84) (OGC:CRS84) 
source(s)   : memory
name        :        lyr.1 
min value   : 0.0001255509 
max value   : 0.9881328584 

Raster data

plot(r)

Basic raster in R.

Raster data

We can access data from specific locations within a raster

# Access data from the ith location in a raster
r[12]
      lyr.1
1 0.7269309
# Access data based on row and column
r[2, 2] 
      lyr.1
1 0.7269309

Raster data

Rasters can be stacked which can make them very efficient to work with

# Create 2 new rasters based on raster r
r2 <- r * 50
r3 <- sqrt(r * 5)

# Stack rasters and rename to be unique
s <- c(r, r2, r3)
names(s) <- c('r1', 'r2', 'r3')

Raster data

plot(s)

Working with real data

  • There are several packages for accessing geospatial data from the web
  • We will use the FedData package, but numerous other packages exist to access data within and without the U.S.
  • One useful example is the elevatr package for accessing elevation data around the world

Working with real data

We will walk through an example of extracting information from a raster using a polygon layer. To do this we will:

  • Select just Corvallis among oregon_cities
  • Add a 10,000m buffer
  • Download National Elevation Data
  • Transform the projection system of the elevation data to match Corvallis
  • Calculate the average elevation within 10km of Corvallis

Working with real data

  1. Buffer Corvallis
library(FedData)

# Select just Corvallis and calculate a 10,000-m buffer
corvallis <- 
  oregon_cities %>%
  filter(cities == 'Corvallis') %>% 
  st_buffer(10000)

Working with real data

  1. Download NED based on Corvallis buffer
# Download national elevation data (ned)
ned <- FedData::get_ned(
  template = corvallis,
  label = "corvallis")
  1. Transform the CRS
ned <- terra::project(ned, 
                      'epsg:5070',
                      method = 'bilinear')

Working with real data

  1. Mean elevation within polygon
# zonal function in terra to calculate zonal statistics
terra::zonal(ned, 
             
             # Need to convert corvallis `sf` object to terra vector
             terra::vect(corvallis), 
             
             # Metric to be calculated
             mean, na.rm = T)
   Layer_1
1 119.6678

Your Turn

  1. Read in U.S. cities with data('us.cities') from the maps library
  2. Select the city of your choice and buffer it by 10Km. (We suggest converting to an equal area projection first)
  3. Read in National Elevation Data for your city with the FedData package
  4. Transform CRS of elevation data to match city
  5. Calculate the mean elevation within 10km of your city
08:00

Solution

1-2. Buffer city of your choice

library(maps)
data('us.cities')

my_city <- us.cities %>% 
  filter(name == 'Idaho Falls ID') %>% 
  st_as_sf(coords = c('long', 'lat'), crs = 4269) %>% 
  st_transform(crs = 5070) %>% 
  st_buffer(10000)

Solution

  1. Read in elevation data
ned <- FedData::get_ned(
  template = my_city,
  label = "Idaho Falls")

Solution

  1. Transform CRS
ned <- terra::project(ned, 
                      'epsg:5070',
                      method = 'bilinear')

Solution

  1. Calculate mean elevation within buffer
terra::zonal(ned, 
             terra::vect(my_city), 
             mean, na.rm = T)
  USGS_1_n44w112
1       1450.104

Watershed Delineation

  • Characterizing watersheds is fundamental to much of our work in freshwater science
  • Although it is more than we can cover in today’s workshop, we want you to be aware that there are several options for delineating watersheds in R
  • We’ll provide two examples of how to delineate watersheds within the conterminous U.S. using two online services

USGS StreamStats

The USGS’s StreamStats is an online service and map interface that allows users to navigate to a desired location and delineate a watershed boundary with the click of a mouse:

https://streamstats.usgs.gov/ss/

In addition to the map interface, the data are also accessible via an API:

https://streamstats.usgs.gov/docs/streamstatsservices

USGS StreamStats

It is also possible to replicate this functionality in R:

streamstats_ws = function(state, longitude, latitude){
  p1 = 'https://streamstats.usgs.gov/streamstatsservices/watershed.geojson?rcode='
  p2 = '&xlocation='
  p3 = '&ylocation='
  p4 = '&crs=4269&includeparameters=false&includeflowtypes=false&includefeatures=true&simplify=true'
  query <-  paste0(p1, state, p2, toString(longitude), p3, toString(latitude), p4)
  mydata <- jsonlite::fromJSON(query, simplifyVector = FALSE, simplifyDataFrame = FALSE)
  poly_geojsonsting <- jsonlite::toJSON(mydata$featurecollection[[2]]$feature, auto_unbox = TRUE)
  poly <- geojsonio::geojson_sf(poly_geojsonsting) 
  poly
}

# Define location for delineation (Calapooia Watershed)
state <- 'OR'
latitude <- 44.62892
longitude <- -123.13113

# Delineate watershed
cal_ws <- streamstats_ws('OR', longitude, latitude) %>% 
  st_transform(crs = 5070)

USGS StreamStats

nhdplusTools

  • nhdplusTools is an R package that can access the Network Linked Data Index (NLDI) service, which provides navigation and extraction of NHDPlus data: https://doi-usgs.github.io/nhdplusTools/
  • nhdplusTools includes network navigation options as well as watershed delineation
  • The delineation method differs from StreamStats (based on National Hydrography IDs)

nhdplusTools

library(nhdplusTools)

# Simple feature option to generate point without any other attributes
cal_pt <- st_sfc(st_point(c(longitude, latitude)), crs = 4269)

# Identify the network location (NHDPlus common ID or COMID)
start_comid <- nhdplusTools::discover_nhdplus_id(cal_pt)

# Combine info into list (required by NLDI basin function)
ws_source <- list(featureSource = "comid", featureID = start_comid)

cal_ws2 <- nhdplusTools::get_nldi_basin(nldi_feature = ws_source)

nhdplusTools

nhdplusTools

nhdplusTools works by walking a network of pre-staged sub-catchments with unique IDs (COMIDs)

Other watershed delination methods

  • Outside of the U.S., these tools are not available
  • It is still possible to delineate a custom watershed from raw DEM data with help from the whitebox R package
  • This book walks through this process
  • Jeff Hollister (U.S. EPA) developed an R package called elevatr that can access DEM data from the web both within and outside the U.S.

Your Turn

  1. Delineate the Logan River watershed in Utah at -111.855, 41.707
  2. Use mapview::mapview to plot watershed
08:00

Solution

  1. Delineate Logan River watershed
# Logan River Watershed
latitude <- 41.707
longitude <- -111.855

# Define the lat/lon
start_point <- st_sfc(st_point(c(longitude, latitude)), crs = 4269)
# Find COMID of this point
start_comid <- nhdplusTools::discover_nhdplus_id(start_point)
# Create a list object that defines the feature source and starting COMID
ws_source <- list(featureSource = "comid", featureID = start_comid)
# Delineate basin
logan_ws <- nhdplusTools::get_nldi_basin(nldi_feature = ws_source)

Solution

  1. Map the Logan River watershed
mapview::mapview(logan_ws)

StreamCatTools

Why StreamCat/LakeCat?

  • Examples covered are for single watersheds - but we often need data for hundreds or thousands
  • Overlaying watersheds onto numerous landscape rasters/layers is complicated/long process
  • StreamCat/LakeCat solves this issue by providing summarized data for all streams and lakes in the NHDPlus (medium resolution)

Why StreamCat/LakeCat?

Let’s revisit the Calapooia watershed and calculate percent row crop for 2019 with FedData

  • Download NLCD for 2019
nlcd <- get_nlcd(
  template = cal_ws2,
  year = 2019,
  label = 'Calapooia') %>%
  terra::project('epsg:5070', method = 'near') 

Why StreamCat/LakeCat?

  • Use terra to extract the watershed metrics
terra::extract(nlcd,
               terra::vect(cal_ws2)) %>%
  group_by(Class) %>%
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count)) * 100) %>%
  filter(Class == 'Cultivated Crops') %>%
  print()
# A tibble: 1 × 3
  Class             count percent
  <fct>             <int>   <dbl>
1 Cultivated Crops 112485    10.5

Why StreamCat/LakeCat?

Now let’s extract the same information using the StreamCatTools package to access the online StreamCat database

library(StreamCatTools)

comid <- '23763521'

sc_get_data(comid = comid,
            metric = 'PctCrop2019', 
            aoi = 'watershed')
# A tibble: 1 × 3
     COMID WSAREASQKM PCTCROP2019WS
     <dbl>      <dbl>         <dbl>
1 23763521       965.          10.5

StreamCatTools

  • StreamCat and the StreamCatTools package provide access to the same information with much less code because StreamCat pre-stages watershed metrics for each NHDPlus catchment

StreamCatTools

StreamCat also provides metrics at several scales

Figure 5: Geospatial framework of the StreamCat Dataset

StreamCatTools

sc_get_data(comid = comid,
            metric = 'PctCrop2019', 
            aoi = 'catchment,watershed')
# A tibble: 1 × 5
     COMID WSAREASQKM CATAREASQKM PCTCROP2019CAT PCTCROP2019WS
     <dbl>      <dbl>       <dbl>          <dbl>         <dbl>
1 23763521       965.        8.49           36.6          10.5

StreamCatTools

For a subset of watershed metrics, summaries within ~100m of the stream segment are also available for catchments and watersheds:

Figure 6: Riparian buffers (red) of NHD stream lines (white) and on-network NLCD water pixels (blue).

StreamCatTools

sc_get_data(comid = comid,
            metric = 'PctCrop2019', 
            aoi = 'catchment,riparian_catchment,watershed,riparian_watershed') %>% 
  as.data.frame()
     COMID WSAREASQKM WSAREASQKMRP100 CATAREASQKM CATAREASQKMRP100
1 23763521   964.6101          136.17      8.4933           1.1097
  PCTCROP2019CATRP100 PCTCROP2019WSRP100 PCTCROP2019CAT PCTCROP2019WS
1                9.08                8.4          36.59         10.49

StreamCatTools

StreamCatTools also makes it very easy to grab data for entire regions.

library(ggplot2)

iowa_crop <- sc_get_data(state = 'IA',
                         metric = 'PctCrop2019', 
                         aoi = 'watershed')

StreamCatTools

ggplot() + 
  geom_histogram(data = iowa_crop,
                 aes(x = PCTCROP2019WS)) + 
  theme_bw()

StreamCatTools

Histogram of crop as a % of watersheds within Iowa in 2019.

StreamCatTools

  • We can provide multiple metrics to the request by separating them with a comma.
  • Note that the request is provided as a single string, 'PctCrop2001, PctCrop2019', rather than a vector of metrics: c('PctCrop2001', 'PctCrop2019').
  • The request itself is agnostic to formatting of the text. For example, these requests will also work: 'pctcrop2001, pctcrop2019' or 'PCTCROP2001,PCTCROP2019'.

StreamCatTools

StreamCat contains hundreds of metrics and we recommend consulting the metric list to identify those of interest for your study.

Accessing LakeCat

The R function to access LakeCat data was designed to parallel StreamCat functions

Let’s walk through an example together that:

  • Define a sf object of a sample point at Pelican Lake, WI
  • Obtain the lake waterbody polygon
  • Extract the COMID (unique ID) to query LakeCat
  • Pull data on mean watershed elevation, calcium oxide content of the geology, % sand and organic matter content of soils, and % of the watershed composed of deciduous forest

Accessing LakeCat

library(nhdplusTools)

# Pelican Lake, WI
latitude <- 45.502840
longitude <- -89.198694

pelican_pt <- data.frame(site_name = 'Pelican Lake',
                         latitude = latitude,
                         longitude = longitude) %>% 
  st_as_sf(coords = c('longitude', 'latitude'), crs = 4326)

pelican_lake <- nhdplusTools::get_waterbodies(pelican_pt) 

comid <- pelican_lake %>% 
  pull(comid)

lc_get_data(metric = 'elev, cao, sand, om, pctdecid2019',
            aoi = 'watershed',
            comid = comid)
# A tibble: 1 × 7
      COMID WSAREASQKM SANDWS ELEVWS CAOWS PCTDECID2019WS  OMWS
      <dbl>      <dbl>  <dbl>  <dbl> <dbl>          <dbl> <dbl>
1 167120863       41.0   59.6   491.  4.81           14.7  16.7

Your Turn

Query LakeCat to determine if the basins of these lakes had more deciduous or conifer forest in 2019:

“9028333,9025609,9025611,9025635”

The LakeCat metrics page may help

05:00

Solution

library(StreamCatTools)

comids <- "9028333,9025609,9025611,9025635"

lc_get_data(metric = 'pctdecid2019, pctconif2019',
            aoi = 'watershed',
            comid = comids) 
# A tibble: 4 × 4
    COMID WSAREASQKM PCTDECID2019WS PCTCONIF2019WS
    <dbl>      <dbl>          <dbl>          <dbl>
1 9025609      5.27            18.1          0    
2 9025611      0.447           19.3          0    
3 9025635     13.6             22.3          0.339
4 9028333      6.62            34.5          2.32 

Bringing it together

  • To finish the workshop, we will walk through 2 examples of using spmodel to model phenomena in two types of freshwaters:
    • Electrical conductivity of lake water across Minnesota
    • The distribution of Argia damselfly across several northeastern states

Lake Conductivity

  • Conductivity is an important water quality measure and one of growing concern due to salinization of freshwater (Cañedo-Argüelles et al. 2016, Kaushal et al. 2021)
  • This analysis is based on a recent paper by Michael Dumelle and others published in the journal Spatial Statistics. The GitHub repository for this paper is also available

Data Prep: Conductivity (Dependent) Data

Load required packages…

library(tidyverse)
library(sf)
library(tigris)
library(StreamCatTools)
library(spmodel)
library(data.table)

Data Prep: Conductivity (Dependent) Data

Read and prep table of lake conductivity values…

# Read in states to give some context
states <- tigris::states(cb = TRUE, progress_bar = FALSE)  %>% 
  filter(!STUSPS %in% c('HI', 'PR', 'AK', 'MP', 'GU', 'AS', 'VI'))  %>% 
  st_transform(crs = 5070)

# Read in lakes, select/massage columns, convert to spatial object
lake_cond <- fread('nla_obs.csv') %>% 
  select(UNIQUE_ID, COMID, COND_RESULT,
         AREA_HA, DSGN_CYCLE,
         XCOORD, YCOORD) %>% 
  mutate(DSGN_CYCLE = factor(DSGN_CYCLE)) %>% 
  st_as_sf(coords = c('XCOORD', 'YCOORD'), 
           crs = "+proj=aea +lat_0=37.5 +lon_0=-96 +lat_1=29.5 +lat_2=45.5 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs +type=crs") %>% 
  st_transform(crs = 5070)

Data Prep: Conductivity (Dependent) Data

ggplot() +
  geom_sf(data = states,
          fill = NA) +
  geom_sf(data = lake_cond,
          aes(color = DSGN_CYCLE)) +
  scale_color_manual(values=c("#a6cee3", "#1f78b4", "#b2df8a")) +
  theme_bw() +
  theme(legend.position="bottom") 

Data Prep: Conductivity (Dependent) Data

NLA lake sampes across the U.S.

Data Prep: Conductivity (Dependent) Data

Select sample sites within Minnesota

MN <- states %>% 
  filter(STUSPS == 'MN')

cond_mn <- lake_cond %>% 
  st_filter(MN) %>% 
  rename(year = DSGN_CYCLE)

Data Prep: Conductivity (Dependent) Data

ggplot() +
  geom_sf(data = MN,
          fill = NA) +
  geom_sf(data = cond_mn,
          aes(color = log(COND_RESULT))) +
  scale_color_distiller(palette = 'YlOrRd', direction = 1) +
  theme_bw() +
  theme(legend.position="bottom") 

Data Prep: Conductivity (Dependent) Data

Distribution of conductivity values.

Data Prep: LakeCat (Independent) Data

We will use the similar watershed predictors as Dumelle et al. (2023).

Already included with data table with the response variable are:

  • Lake Area (AREA_HA)
  • Sample year (DSGN_CYCLE)

Data Prep: LakeCat (Independent) Data

From LakeCat, we also will get the following predictor variables:

  • Local long-term air temperature
  • Long-term watershed precipitation
  • Sulfur content of underlying lithology

Data Prep: LakeCat (Independent) Data

comids <- cond_mn$COMID

mn_lakecat <- lc_get_data(comid = comids,
                          metric = 'Tmean8110, Precip8110, S') %>% 
  select(COMID, TMEAN8110CAT, PRECIP8110WS, SWS)

mn_lakecat
# A tibble: 115 × 4
     COMID TMEAN8110CAT PRECIP8110WS    SWS
     <dbl>        <dbl>        <dbl>  <dbl>
 1 1099282         7.69         825. 0.0365
 2 1101812         7.91         812. 0.0329
 3 1768378         3.51         696. 0.0985
 4 1775870         4.28         749. 0.272 
 5 1776124         4.43         778. 0.272 
 6 2032039         7.23         794. 0.0712
 7 2034063         7.25         845. 0.0712
 8 2262633         5.58         768. 0.0712
 9 2262729         5.62         786. 0.0329
10 2350690         6.71         735. 0.0712
# ℹ 105 more rows

Data Prep: LakeCat (Independent) Data

In addition to these static LakeCat data, we would also like to pull in data from specific years of NLCD to match sample years for % of watershed composed of crop area

To do this we will need to:

  • Grab LakeCat NLCD % crop data for years 2006, 2011, 2016
  • Clean and pivot the columns and add a year column
  • Add 1 to each year since available NLCD are 1 year behind field samples

Data Prep: LakeCat (Independent) Data

crop <- 
  
  # Grab LakeCat crop data
  lc_get_data(comid = comids,
              aoi = 'watershed',
              metric = 'pctcrop2006, pctcrop2011, pctcrop2016') %>% 
  
  # Remove watershed area from data
  select(-WSAREASQKM) %>% 
  
  # Pivot table to long to create "year" column
  pivot_longer(!COMID, names_to = 'tmpcol', values_to = 'PCTCROPWS') %>% 
  
  # Remove PCTCROP and WS to make "year" column
  mutate(year = as.integer(
    str_replace_all(tmpcol, 'PCTCROP|WS', ''))) %>% 
  
  # Add 1 to each year to match NLA years
  mutate(year = factor(year + 1)) %>% 
  
  # Remove the tmp column
  select(-tmpcol)

Data Prep: LakeCat (Independent) Data

Now, join the tables to make our model data

model_data <- cond_mn %>% 
  left_join(mn_lakecat, join_by(COMID)) %>% 
  left_join(crop, join_by(COMID, year))

Modeling lake conductivity

Define the formula

formula <- 
  log(COND_RESULT) ~ 
  AREA_HA + 
  year + 
  TMEAN8110CAT +
  PRECIP8110WS + 
  PCTCROPWS + 
  SWS

Modeling lake conductivity

Run a non-spatial and spatial model for comparison

cond_mod <- splm(formula = formula,
                 data = model_data,
                 spcov_type = 'none')

cond_spmod <- splm(formula = formula,
                   data = model_data,
                   spcov_type = 'exponential')

summary(cond_spmod)

Call:
splm(formula = formula, data = model_data, spcov_type = "exponential")

Residuals:
     Min       1Q   Median       3Q      Max 
-1.87192 -0.19602  0.02576  0.29597  1.28615 

Coefficients (fixed):
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)   8.486e+00  8.813e-01   9.630  < 2e-16 ***
AREA_HA       2.394e-05  5.510e-05   0.435   0.6639    
year2012     -1.270e-01  3.261e-02  -3.896 9.80e-05 ***
year2017     -9.871e-02  3.851e-02  -2.563   0.0104 *  
TMEAN8110CAT  5.375e-01  6.740e-02   7.976 1.55e-15 ***
PRECIP8110WS -8.406e-03  1.313e-03  -6.404 1.51e-10 ***
PCTCROPWS     4.317e-03  2.767e-03   1.560   0.1188    
SWS           1.110e+00  7.740e-01   1.435   0.1514    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Pseudo R-squared: 0.5024

Coefficients (exponential spatial covariance):
       de        ie     range 
2.685e-01 1.371e-02 2.167e+04 

Modeling lake conductivity

Model comparison

glances(cond_mod, cond_spmod)
# A tibble: 2 × 10
  model         n     p  npar value   AIC  AICc logLik deviance pseudo.r.squared
  <chr>     <int> <dbl> <int> <dbl> <dbl> <dbl>  <dbl>    <dbl>            <dbl>
1 cond_spm…   162     8     3  161.  167.  167.  -80.5     154.            0.502
2 cond_mod    162     8     1  284.  286.  286. -142.      154             0.775

Modeling lake conductivity

Leave-one-out comparison

prd_mod <- spmodel::loocv(cond_mod, se.fit = TRUE, cv_predict = TRUE) 

prd_spmod <- spmodel::loocv(cond_spmod, se.fit = TRUE, cv_predict = TRUE)

rbind(prd_mod %>% pluck('stats'),
      prd_spmod %>% pluck('stats'))
# A tibble: 2 × 4
      bias  MSPE RMSPE  cor2
     <dbl> <dbl> <dbl> <dbl>
1 -0.00261 0.270 0.519 0.731
2 -0.00392 0.140 0.374 0.860

Map predicted values and standard errors

Join back to model data for mapping

# Combine predictions with model data (spatial points)
model_data <-
  model_data %>% 
  mutate(prd_cond = prd_spmod %>% 
           pluck('cv_predict'), 
         se_fit = prd_spmod %>% 
           pluck('se.fit'))

Map predicted values and standard errors

ggplot() +
  geom_sf(data = MN,
          fill = NA) +
  geom_sf(data = model_data,
          aes(color = prd_cond)) +
  scale_color_distiller(palette = 'YlOrRd', direction = 1) +
  theme_bw() +
  theme(legend.position="bottom") 

Map predicted values and standard errors

Predicted log(conductivity) values.

Map predicted values and standard errors

ggplot() +
  geom_sf(data = MN,
          fill = NA) +
  geom_sf(data = model_data,
          aes(color = se_fit)) +
  scale_color_distiller(palette = 'Reds', direction = 1) +
  theme_bw() +
  theme(legend.position="bottom") 

Map predicted values and standard errors

Model standard errors.